# Pakete für Data Wrangling und Visualisierung
library(tidyverse)
library(rsample)
library(hablar)
# Pakete für das HTML
library(bookdown)
library(knitr)
# Recommenderlab und ähnlich
library(recommenderlab)
library(vegan)
library(coop)
# Konfiguration der Pakete
knitr::opts_chunk$set(fit.align = "left", cache = TRUE, warning = FALSE, message = FALSE)
set.seed(100)
# Einlesen der CSV-Dateien und erstellen der samples
movies1 <- read.csv("ml-latest-small/movies.csv", sep = ",")
links1 <- read.csv("ml-latest-small/links.csv", sep = ",")
ratings1 <- read.csv("ml-latest-small/ratings.csv", sep = ",")
tags1 <- read.csv("ml-latest-small/tags.csv", sep = ",")
# Sample von 70%
set.seed(69)
movies2 <- movies1 %>% slice_sample(prop = 0.7)
links2 <- subset(links1, movieId %in% movies2$movieId)
ratings2 <- subset(ratings1, movieId %in% movies2$movieId) %>% slice_sample(prop = 0.7)
tags2 <- subset(tags1, movieId %in% movies2$movieId)
# 2ter Sample von 70%
set.seed(100)
movies1 <- movies1 %>% slice_sample(prop = 0.7)
links1 <- subset(links1, movieId %in% movies1$movieId)
ratings1 <- subset(ratings1, movieId %in% movies1$movieId) %>% slice_sample(prop = 0.7)
tags1 <- subset(tags1, movieId %in% movies1$movieId)
left_join(movies1, ratings1, "movieId") %>%
group_by(title, movieId, genres) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(3)
## # A tibble: 3 × 4
## # Groups: title, movieId [3]
## title movieId genres count
## <chr> <int> <chr> <int>
## 1 Forrest Gump (1994) 356 Comedy|Drama|Romance|… 234
## 2 Pulp Fiction (1994) 296 Comedy|Crime|Drama|Th… 200
## 3 Star Wars: Episode IV - A New Hope (1977) 260 Action|Adventure|Sci-… 181
left_join(movies2, ratings2, "movieId") %>%
group_by(title, movieId, genres) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(3)
## # A tibble: 3 × 4
## # Groups: title, movieId [3]
## title movieId genres count
## <chr> <int> <chr> <int>
## 1 Shawshank Redemption, The (1994) 318 Crime|Drama 228
## 2 Pulp Fiction (1994) 296 Comedy|Crime|Drama|Th… 219
## 3 Star Wars: Episode IV - A New Hope (1977) 260 Action|Adventure|Sci-… 182
In den beiden Outputs haben wir die Aufzählung der 3 meist bewerteten Filme, bei dem die Spalte ‘count’ die Anzahl Bewertungen ist. Die Top 3 Filme wurden bei beiden Datensätzen etwa 180 bis 240 mal bewertet.
Wir können nicht bestimmen, wie oft ein Film geschaut wurde, da es zu dieser Information keine Daten gibt. Als alternative definieren wir, dass geschaut und bewertet gleichgestellt wird. Die am meist geschauten/bewerteten Filme sind “Forrest Gump”, “Pulp Fiction”, “Star Wars: Episode IV - A New Hope” und “Shawshank Redemption”).
genres_sep1 <- movies1 %>%
separate_rows(genres, sep = "\\|", convert = FALSE) %>%
replace(. == "", "no genres listed")
genres_sep1 %>%
right_join(ratings1, "movieId") %>%
group_by(genres) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(3)
## # A tibble: 3 × 2
## genres count
## <chr> <int>
## 1 Drama 20803
## 2 Comedy 19432
## 3 Action 14383
genres_sep2 <- movies2 %>%
separate_rows(genres, sep = "\\|", convert = FALSE) %>%
replace(. == "", "no genres listed")
genres_sep2 %>%
right_join(ratings2, "movieId") %>%
group_by(genres) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
head(3)
## # A tibble: 3 × 2
## genres count
## <chr> <int>
## 1 Drama 20230
## 2 Comedy 20019
## 3 Action 14118
In beiden Outputs haben wir die meist bewerteten Filmgenres, bei dem die Spalte ‘count’ signalisiert, bei wie vielen Filmbewertungen der bewertete Film dieses Genre beinhaltet. Der Outputs ist bei beiden Datensätzen sehr ähnlich.
Die am meist geschauten/bewerteten Genres sind Drama, Comedy und Action.
# Gesamthaft
summary(ratings1$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.500 3.000 3.500 3.487 4.000 5.000
ggplot(ratings1, aes(rating)) +
geom_bar() +
labs(
title = "Verteilung der Kundenratings",
x = "Bewertung",
y = "Anzahl Bewertungen",
subtitle = paste("Durchschnittsbewertung: ", mean(ratings1$rating))
) +
theme_classic() +
theme(legend.position = "none")
# Gesamthaft
summary(ratings2$rating)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.500 3.000 3.500 3.494 4.000 5.000
ggplot(ratings2, aes(rating)) +
geom_bar() +
labs(
title = "Verteilung der Kundenratings",
x = "Bewertung",
y = "Anzahl Bewertungen",
subtitle = paste("Durchschnittsbewertung: ", mean(ratings2$rating))
) +
theme_classic() +
theme(legend.position = "none")
In diesen Plots wird die Verteilung der Kundenratings visualisiert.
Die Kundenratings sind nicht ganz normalverteilt, aber nahe. Die meisten Bewertungen sind im Bereich der natürlichen Zahlen, wenige Bewertungen sind ein Wert zwischen zwei dieser Zahlen. Öfters enthält eine Bewertung den Wert 4. Der Durchschnitt aller Bewertungen liegt bei etwa 3,5.
# Nach Genres
genres_sep_ratings1 <- genres_sep1 %>%
right_join(ratings1, "movieId")
ggplot(genres_sep_ratings1, aes(x = rating, fill = genres)) +
geom_bar(aes(y = ..prop.., group = 1)) +
facet_wrap(~genres) +
labs(
title = "Verteilung der Kundenratings nach Genre",
x = "Bewertung",
y = "Verteilung",
) +
theme_classic() +
theme(legend.position = "none")
# Nach Genres
genres_sep_ratings2 <- genres_sep2 %>%
right_join(ratings2, "movieId")
ggplot(genres_sep_ratings2, aes(x = rating, fill = genres)) +
geom_bar(aes(y = ..prop.., group = 1)) +
facet_wrap(~genres) +
labs(
title = "Verteilung der Kundenratings nach Genre",
x = "Bewertung",
y = "Verteilung",
) +
theme_classic() +
theme(legend.position = "none")
Diese Plots sind ähnlich wie die letzten zwei. Hier wird die Verteilung der Kundenratings nach Kategorie visualisiert.
Die Verteilung der Kundenratings ähneln sich bei vielen Kategorien der Verteilung der Gesamtmenge. Jedoch mit einigen Ausnahmen: Dokumentarfilme haben zum Beispiel überdurchschnittlich viele Bewertungen mit dem Wert 4 und unterdurchschnittlich wenig Bewertungen mit dem Wert 3 und 5. Man könnte sagen, dass Dokumentarfilme sehr konstante Ratings haben.
mean_rating_movie1 <- ratings1 %>%
group_by(movieId) %>%
summarise(mean_rating = mean(rating), count = n())
ggplot(mean_rating_movie1, aes(mean_rating)) +
geom_histogram(bins = 50) +
labs(
title = "Verteilung der mittleren Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic()
mean_rating_movie2 <- ratings2 %>%
group_by(movieId) %>%
summarise(mean_rating = mean(rating), count = n())
ggplot(mean_rating_movie2, aes(mean_rating)) +
geom_histogram(bins = 50) +
labs(
title = "Verteilung der mittleren Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic()
Hier wird die Verteilung der Durchschnittswerte der Bewertungen nach Film visualisiert.
Da einige Filme nur wenige Bewertungen haben, liegen sehr viele Mittelwerte bei ganzen oder halben Zahlen. Deswegen gibt es bei unseren Plots einige hohe Balken.
ggplot(mean_rating_movie1 %>% filter(count >= 5), aes(mean_rating)) +
geom_histogram(bins = 50) +
labs(
title = "Verteilung der mittleren Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic()
ggplot(mean_rating_movie2 %>% filter(count >= 5), aes(mean_rating)) +
geom_histogram(bins = 50) +
labs(
title = "Verteilung der mittleren Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic()
Hier wird das gleiche wie beim letzten Plot visualisiert. Jedoch wurden Filme mit weniger als 5 Bewertungen entfernt.
Wenn man alle Filme mit weniger als 5 Bewertungen entfernt, erkennt man, dass die Bewertungen der Filme linksschief verteilt sind.
ggplot(mean_rating_movie1, aes(mean_rating, count, color = mean_rating)) +
geom_point(alpha = 0.3) +
labs(
title = "Verteilung der mittleren Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Anzahl Bewertungen"
) +
theme_classic() +
scale_color_gradient(low = "red", high = "green") +
theme(legend.position = "none")
ggplot(mean_rating_movie2, aes(mean_rating, count, color = mean_rating)) +
geom_point(alpha = 0.3) +
labs(
title = "Verteilung der mittleren Kundenratings pro Film",
x = "Durchschnittliche Bewertung",
y = "Anzahl Bewertungen"
) +
theme_classic() +
scale_color_gradient(low = "red", high = "green") +
theme(legend.position = "none")
Hier wird die Verteilung der Durchschnittswerte der Bewertungen nach Film in Abhängigkeit von der Anzahl Bewertungen nach Film visualisiert.
Hier werden die gleichen Daten anders dargestellt. Man erkennt, dass desto öfters ein Film bewertet wird, desto näher liegt die durchschnittliche Bewertung bei 4. Man kann dies vielleicht begründen, indem man sagt, dass ein schlechter Film weniger geschaut und deswegen weniger bewertet wird. Jedoch können wir uns nur schwer erklären, wieso Filme mit einer Bewertung über 4 nicht so oft geschaut/bewertet werden.
sample_values <- sample(1:610, 4, replace = FALSE)
ratings1 %>%
filter(userId %in% sample_values) %>%
ggplot(., aes(rating)) +
geom_density(aes(color = factor(userId))) +
labs(
title = "Streuung von Bewertungen von Kunden",
subtitle = "random sample",
x = "Bewertung",
y = "Verteilung",
color = "User ID"
) +
theme_classic()
sample_values <- sample(1:610, 4, replace = FALSE)
ratings2 %>%
filter(userId %in% sample_values) %>%
ggplot(., aes(rating)) +
geom_density(aes(color = factor(userId))) +
labs(
title = "Streuung von Bewertungen von Kunden",
subtitle = "random sample",
x = "Bewertung",
y = "Verteilung",
color = "User ID"
) +
theme_classic()
sd_ratings1 <- ratings1 %>%
group_by(userId) %>%
summarise(SD = sd(rating), count = n())
ggplot(sd_ratings1, aes(SD, count, color = count)) +
geom_point() +
labs(
title = "Standardabweichung der Ratings pro User",
x = "Standardabweichung",
y = "Anzahl Ratings",
color = "Anzahl Ratings"
) +
theme_classic() +
scale_color_gradient(low = "green", high = "black") +
theme(legend.position = "none")
ggplot(sd_ratings1, aes(SD)) +
geom_boxplot() +
labs(
title = "Standardabweichung der Ratings pro User",
x = "Standardabweichung",
subtitle = paste("Durchschnittsstandardabweichung: ", mean(sd_ratings1$SD)),
) +
theme_classic()
sd_ratings2 <- ratings2 %>%
group_by(userId) %>%
summarise(SD = sd(rating), count = n())
ggplot(sd_ratings2, aes(SD, count, color = count)) +
geom_point() +
labs(
title = "Standardabweichung der Ratings pro User",
x = "Standardabweichung",
y = "Anzahl Ratings",
color = "Anzahl Ratings"
) +
theme_classic() +
scale_color_gradient(low = "green", high = "black") +
theme(legend.position = "none")
ggplot(sd_ratings2, aes(SD)) +
geom_boxplot() +
labs(
title = "Standardabweichung der Ratings pro User",
x = "Standardabweichung",
subtitle = paste("Durchschnittsstandardabweichung: ", mean(sd_ratings1$SD)),
) +
theme_classic()
In allen Plots werden die Standardabweichungen aller Bewertungen individueller User geplottet. In den Scatterplots wird zusätzlich die Anzahl Ratings an der y-Achse visualisiert.
Der Mittelwert der Standardabweichung der Ratings der User befindet sich um den Wert 0,9. Die Bewertungen streuen sich weniger als bei einer Normalverteilung.
norm_ratings1 <- ratings1 %>%
group_by(userId) %>%
summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
full_join(., ratings1, by = "userId")
norm_ratings1$z_rating <- (norm_ratings1$rating - norm_ratings1$mean_rating) /
norm_ratings1$sd_rating
ggplot(norm_ratings1, aes(z_rating)) +
geom_density() +
labs(
title = "Normierte Ratings",
x = "Z-Normiertes Rating",
y = "Verteilung"
) +
theme_classic()
sample_values1 <- sample(1:610, 4, replace = FALSE)
norm_ratings1 %>%
filter(userId %in% sample_values) %>%
ggplot(., aes(z_rating)) +
geom_density(aes(color = factor(userId))) +
labs(
title = "Normierte Ratings von Kunden",
subtitle = "random sample",
x = "Normierte Bewertung",
y = "Verteilung",
color = "User ID"
) +
theme_classic()
norm_ratings2 <- ratings2 %>%
group_by(userId) %>%
summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
full_join(., ratings2, by = "userId")
norm_ratings2$z_rating <- (norm_ratings2$rating - norm_ratings2$mean_rating) /
norm_ratings2$sd_rating
ggplot(norm_ratings2, aes(z_rating)) +
geom_density() +
labs(
title = "Normierte Ratings",
x = "Z-Normiertes Rating",
y = "Verteilung"
) +
theme_classic()
sample_values2 <- sample(1:610, 4, replace = FALSE)
norm_ratings2 %>%
filter(userId %in% sample_values) %>%
ggplot(., aes(z_rating)) +
geom_density(aes(color = factor(userId))) +
labs(
title = "Normierte Ratings von Kunden",
subtitle = "random sample",
x = "Normierte Bewertung",
y = "Verteilung",
color = "User ID"
) +
theme_classic()
In diesen Plots visualisieren wir zuerst die normierte Verteilung der Ratings von 4 zufällig gewählten User und danach visualisieren wir die normierte Verteilung der Ratings der Gesamtmenge.
Der Mittelwert der Bewertungen pro User befindet sich jetzt bei 0. Alle Ratings unter 0 könnte man als “gefällt dem User nicht” interpretieren und alle Rating über 0 könnte man als “gefällt dem User” interpretieren. Desto weiter sich die Bewertung von 0 entfernt desto mehr oder weniger gefällt dem User der Film.
user_item1 <- norm_ratings1 %>%
select(movieId, userId, z_rating) %>%
pivot_wider(names_from = movieId, values_from = z_rating)
sum(is.na(user_item1)) / (dim(user_item1)[1] * (dim(user_item1)[2]))
## [1] 0.9863269
user_item2 <- norm_ratings2 %>%
select(movieId, userId, z_rating) %>%
pivot_wider(names_from = movieId, values_from = z_rating)
sum(is.na(user_item2)) / (dim(user_item2)[1] * (dim(user_item2)[2]))
## [1] 0.9865087
Division der NA Werte durch die Anzahl Werte (NA & nicht NA).
Die User-Item Matrizen sind zu 98.6 % Sparse.
Die Daten wurden auf 400 Kunden und 700 Filme reduziert, indem Filme und Kunden mit sehr wenigen Ratings entfernt wurden
# Filter 700 most rated movies
top_n_movies1 <- norm_ratings1 %>%
group_by(movieId) %>%
count() %>%
arrange(desc(n)) %>%
head(700)
# Join data on 700 most rated movies
user_item_r1 <-
left_join(
top_n_movies1,
norm_ratings1,
by = "movieId"
)
# Filter 700 most rated user
top_n_user1 <- user_item_r1 %>%
group_by(userId) %>%
count() %>%
arrange(desc(n)) %>%
head(400) %>%
ungroup()
# Join data on 400 most rated user (only 700 movies)
user_item_r1 <-
left_join(
top_n_user1,
user_item_r1,
by = "userId"
) %>%
select(userId, movieId, z_rating)
# Pivot wider
m_user_item_r1 <- user_item_r1 %>%
pivot_wider(names_from = movieId, values_from = z_rating) %>%
column_to_rownames(., var = "userId")
# Filter 700 most rated movies
top_n_movies2 <- norm_ratings2 %>%
group_by(movieId) %>%
count() %>%
arrange(desc(n)) %>%
head(700)
# Join data on 700 most rated movies
user_item_r2 <-
left_join(
top_n_movies2,
norm_ratings2,
by = "movieId"
)
# Filter 700 most rated user
top_n_user2 <- user_item_r2 %>%
group_by(userId) %>%
count() %>%
arrange(desc(n)) %>%
head(400) %>%
ungroup()
# Join data on 400 most rated user (only 700 movies)
user_item_r2 <-
left_join(
top_n_user2,
user_item_r2,
by = "userId"
) %>%
select(userId, movieId, z_rating)
# Pivot wider
m_user_item_r2 <- user_item_r2 %>%
pivot_wider(names_from = movieId, values_from = z_rating) %>%
column_to_rownames(., var = "userId")
# Sparsity Sample 1
sum(is.na(m_user_item_r1)) / (dim(m_user_item_r1)[1] * (dim(m_user_item_r1)[2]))
## [1] 0.9009821
# Sparsity Sample 2
sum(is.na(m_user_item_r2)) / (dim(m_user_item_r2)[1] * (dim(m_user_item_r2)[2]))
## [1] 0.9030429
Hier wurden die Sparsities der neuen Matrizen berechnet.
Die Sparsity wurde deutlich reduziert. Anstatt 98.6% beträgt sie jetzt nur 90%
moviemeans_reducted1 <- colMeans(m_user_item_r1, na.rm = TRUE)
moviemeans_reducted1 <- data.frame(moviemeans_reducted1)
ggplot(moviemeans_reducted1, aes(moviemeans_reducted1)) +
geom_density() +
labs(
title = "Streuung von durchschnittlichen Bewertung von Filmen",
subtitle = "reduzierter Datensatz 1",
x = "durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic() +
xlim(-2, 2)
moviemeans1 <- colMeans(user_item1 %>% column_to_rownames(., var = "userId"), na.rm = TRUE)
moviemeans1 <- data.frame(moviemeans1)
ggplot(moviemeans1, aes(moviemeans1)) +
geom_density() +
labs(
title = "Streuung von durchschnittlichen Bewertung von Filmen",
subtitle = "kompletter Datensatz 1",
x = "durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic() +
xlim(-2, 2)
moviemeans_reducted2 <- colMeans(m_user_item_r2, na.rm = TRUE)
moviemeans_reducted2 <- data.frame(moviemeans_reducted2)
ggplot(moviemeans_reducted2, aes(moviemeans_reducted2)) +
geom_density() +
labs(
title = "Streuung von durchschnittlichen Bewertung von Filmen",
subtitle = "reduzierter Datensatz 2",
x = "durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic() +
xlim(-2, 2)
moviemeans2 <- colMeans(user_item2 %>% column_to_rownames(., var = "userId"), na.rm = TRUE)
moviemeans2 <- data.frame(moviemeans2)
ggplot(moviemeans2, aes(moviemeans2)) +
geom_density() +
labs(
title = "Streuung von durchschnittlichen Bewertung von Filmen",
subtitle = "kompletter Datensatz 2",
x = "durchschnittliche Bewertung",
y = "Verteilung"
) +
theme_classic() +
xlim(-2, 2)
Hier wird die Streuung der durchschnittlichen Bewertung einzelner Filme visualisiert. Es wird dabei der reduzierte Datensatz mit dem kompletten Datensatz verglichen.
Man erkennt, dass die Daten beim reduzierten Datensatz grösstenteils nur im Bereich [-1, 1] streuen. Dies ist auch realistisch da es wahrscheinlicher ist, dass ein Film welches nur 1 Mal bewertet wurde eine Bewertung von z.B. -2 hat, als dass 10 User den gleichen Film so schlecht bewerten, dass der Durchschnitt bei -2 liegt.
intersection <- nrow(inner_join(user_item_r1, user_item_r2, by = c("movieId", "userId")))
union <- nrow(user_item_r1) + nrow(user_item_r2) - intersection
intersection / union
## [1] 0.294602
Die berechnete Zahl bezeichnet das Verhältnis von Bewertungen, welche in beiden Datensätzen vorhanden ist.
Die Schnittmenge der Bewertungen zwischen beiden Datensätzen beträgt etwa 30% der Gesamtmenge.
set.seed(69)
split1 <- initial_split(m_user_item_r1, prop = 0.80)
training1 <- as.matrix(training(split1))
test1 <- as.matrix(testing(split1))
set.seed(100)
split2 <- initial_split(m_user_item_r2, prop = 0.80)
training2 <- as.matrix(training(split2))
test2 <- as.matrix(testing(split2))
IBCF1 <- Recommender(as(training1, "realRatingMatrix"), "IBCF",
param = list(normalize = NULL, method = "cosine", k = 30, na_as_zero = TRUE, alpha = 0.5)
)
IBCF2 <- Recommender(as(training2, "realRatingMatrix"), "IBCF",
param = list(normalize = NULL, method = "cosine", k = 30)
)
# extract IBCF similarity matrix
IBCF_sim_matrix1 <- as.data.frame(as.matrix(IBCF1@model[["sim"]]))
# count number of occurrences
IBCF_freq1 <- as.data.frame(colSums(IBCF_sim_matrix1 != 0), optional = TRUE)
colnames(IBCF_freq1) <- "frequency"
ggplot(IBCF_freq1, aes(frequency)) +
geom_histogram(bins = 30)
# extract IBCF similarity matrix
IBCF_sim_matrix2 <- as.data.frame(as.matrix(IBCF2@model[["sim"]]))
# count number of occurrences
IBCF_freq2 <- as.data.frame(colSums(IBCF_sim_matrix2 != 0), optional = TRUE)
colnames(IBCF_freq2) <- "frequency"
ggplot(IBCF_freq2, aes(frequency)) +
geom_histogram(bins = 30)
# Sample 1
# Add movieId as column
IBCF_freq1$movieId <- rownames(IBCF_freq1)
# sort by frequency, select most frequent movies
IBCF_freq_head1 <- IBCF_freq1 %>%
arrange(desc(frequency)) %>%
head(30) %>%
convert(int(movieId))
# count occurrency and the mean rating of the reduced data
IBCF_freq_head1 <- left_join(IBCF_freq_head1, norm_ratings1, by = "movieId") %>%
group_by(movieId) %>%
summarise(
count = n(),
mean = mean(z_rating)
)
# Sample 2
# Add movieId as column
IBCF_freq2$movieId <- rownames(IBCF_freq2)
# sort by frequency, select most frequent movies
IBCF_freq_head2 <- IBCF_freq2 %>%
arrange(desc(frequency)) %>%
head(30) %>%
convert(int(movieId))
# count occurrency and the mean rating of the reduced data
IBCF_freq_head2 <- left_join(IBCF_freq_head2, norm_ratings2, by = "movieId") %>%
group_by(movieId) %>%
summarise(
count = n(),
mean = mean(z_rating)
)
# Sample 1
ggplot(IBCF_freq_head1, aes(count)) +
geom_histogram()
# Sample 2
ggplot(IBCF_freq_head2, aes(count)) +
geom_histogram()
# Sample 1
ggplot(IBCF_freq_head1, aes(mean)) +
geom_density()
# Sample 2
ggplot(IBCF_freq_head2, aes(mean)) +
geom_density()
calculate_jaccard <- function(arr1, arr2) {
# Check which columns are available
vals <- (!is.na(array(arr1)) & !is.na(array(arr2)))
# Remove movieId column from jaccard similarity
vals[1] <- FALSE
# If there are common not na values, calculate jac sim
if (sum(vals) != 0) {
both_true <- arr1[vals] & arr2[vals]
either_true <- arr1[vals] | arr2[vals]
jac_sim <- sum(both_true) / sum(either_true)
return(jac_sim)
}
# If not, return NA
return(NA)
}
calculate_cos <- function(arr1, arr2) {
# Check which columns are available
vals <- (!is.na(array(arr1)) & !is.na(array(arr2)))
# Remove movieId column from cos similarity
vals[1] <- FALSE
# If there are common not na values, calculate cos sim
if (sum(vals) != 0) {
arr1 <- arr1[vals]
arr2 <- arr2[vals]
ab <- crossprod(arr1, arr2)
norma <- norm(arr1, type = "2")
normb <- norm(arr2, type = "2")
cos_sim <- ((ab / (norma * normb)) + 1) / 2
return(cos_sim)
}
# If not, return NA
return(NA)
}
getCorrelationMatrix <- function(data, cos = TRUE) {
# Get array with movieId's
movies <- as.character(data$movieId)
# Create correlation matrix and set diag to 1
correlations <- matrix(
NA,
nrow = length(movies),
ncol = length(movies),
dimnames = list(movies, movies)
)
diag(correlations) <- 1
# Iterate through every movie and preload column
i_counter <- 0
for (i in movies) {
i_counter <- i_counter + 1
row_i <- data %>% filter(movieId == i)
# For every movie, iterate through every movie
j_counter <- 0
for (j in movies) {
j_counter <- j_counter + 1
# If cos similarity was already calculated, skip, else continue
if (i_counter <= j_counter) {
# calculate similarity
row_j <- data %>% filter(movieId == j)
if (cos) {
sim <- calculate_cos(row_i, row_j)
} else {
sim <- calculate_jaccard(row_i, row_j)
}
# set sim in sim matrix
correlations[i, j] <- sim
correlations[j, i] <- sim
}
}
# Track progress
# print(paste(i_counter, " Datasets done"))
}
# Return correlation matrix
return(correlations)
}
numToBool <- function(x) (x >= 0)
getCorrelationMatrixCosNoNA <- function(data, cos = TRUE) {
data[is.na(data)] <- 0
data <- t(data)
AAT <- data %*% t(data)
norm_ <- rep(NA, nrow(data))
for (i in 1:nrow(data)) {
norm_[i] <- sqrt(sum(data[i,]^2))
}
norms <- norm_ %*% t(norm_)
result <- AAT / norms
return((result + 1) / 2)
}
# Erstellung der User-Rating Matrix
set.seed(100)
sample_values1 <- sample(1:6819, 300, replace = FALSE)
norm_ratings1 <- ratings1 %>%
group_by(userId) %>%
summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
full_join(., ratings1, by = "userId")
norm_ratings1$z_rating <- (norm_ratings1$rating - norm_ratings1$mean_rating) /
norm_ratings1$sd_rating
item_user_random_100_1 <- norm_ratings1 %>%
select(movieId, userId, z_rating) %>%
pivot_wider(names_from = userId, values_from = z_rating) %>%
filter(movieId %in% sample_values1) %>%
head(100)
item_user_random_100_bool1 <- item_user_random_100_1 %>% mutate(across(!matches("movieId"), numToBool))
# Erstellung der User-Rating Like-Dislike Matrix
corrNumb1 <- getCorrelationMatrix(item_user_random_100_1, cos = TRUE)
corrBool1 <- getCorrelationMatrix(item_user_random_100_bool1, cos = FALSE)
# Erstellung der User-Rating Matrix
set.seed(100)
sample_values2 <- sample(1:6819, 300, replace = FALSE)
norm_ratings2 <- ratings2 %>%
group_by(userId) %>%
summarise(mean_rating = mean(rating), sd_rating = sd(rating)) %>%
full_join(., ratings2, by = "userId")
norm_ratings2$z_rating <- (norm_ratings2$rating - norm_ratings2$mean_rating) /
norm_ratings2$sd_rating
item_user_random_100_2 <- norm_ratings2 %>%
select(movieId, userId, z_rating) %>%
pivot_wider(names_from = userId, values_from = z_rating) %>%
filter(movieId %in% sample_values1) %>%
head(100)
item_user_random_100_bool2 <- item_user_random_100_2 %>% mutate(across(!matches("movieId"), numToBool))
# Erstellung der User-Rating Like-Dislike Matrix
corrNumb2 <- getCorrelationMatrix(item_user_random_100_2, cos = TRUE)
corrBool2 <- getCorrelationMatrix(item_user_random_100_bool2, cos = FALSE)
item_user_random_100_recommenderlab1 <- item_user_random_100_1 %>%
column_to_rownames(., var = "movieId") %>%
as.matrix(.) %>%
t(.)
corrNumbRL1 <- as.matrix(similarity(as(item_user_random_100_recommenderlab1, "realRatingMatrix"), method = "cosine", which = "items"))
corrNumbRL1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 NA 0.387965776 0.1878327 0.6764612 0.583930818 0.2678179
## 3273 0.3879658 NA 0.2105259 0.4300789 0.003323487 0.3019346
## 457 0.1878327 0.210525910 NA 0.5498116 0.506941205 0.6544472
## 223 0.6764612 0.430078886 0.5498116 NA 0.473340847 0.5391478
## 2366 0.5839308 0.003323487 0.5069412 0.4733408 NA 0.8759123
## 3386 0.2678179 0.301934611 0.6544472 0.5391478 0.875912271 NA
corrNumb1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 1.0000000 0.38796536 0.1878325 0.6764613 0.58393080 0.2678174
## 3273 0.3879654 1.00000000 0.2105260 0.4300789 0.00332348 0.3019346
## 457 0.1878325 0.21052604 1.0000000 0.5498115 0.50694134 0.6544472
## 223 0.6764613 0.43007890 0.5498115 1.0000000 0.47334097 0.5391479
## 2366 0.5839308 0.00332348 0.5069413 0.4733410 1.00000000 0.8759123
## 3386 0.2678174 0.30193458 0.6544472 0.5391479 0.87591235 1.0000000
item_user_random_100_bool_recommenderlab1 <- item_user_random_100_bool1 %>%
column_to_rownames(., var = "movieId") %>%
t(.)
corrBoolRL1 <- as.matrix(similarity(as(item_user_random_100_bool_recommenderlab1, "realRatingMatrix"), method = "jaccard", which = "items"))
corrBoolRL1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 NA 0.00 0.1428571 0.3333333 0.00 0.0000000
## 3273 0.0000000 NA 0.5000000 0.2500000 0.00 0.5000000
## 457 0.1428571 0.50 NA 0.6521739 0.40 0.5714286
## 223 0.3333333 0.25 0.6521739 NA 0.25 0.4285714
## 2366 0.0000000 0.00 0.4000000 0.2500000 NA 1.0000000
## 3386 0.0000000 0.50 0.5714286 0.4285714 1.00 NA
corrBool1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 1.0000000 0.00 0.1428571 0.3333333 0.00 0.0000000
## 3273 0.0000000 1.00 0.5000000 0.2500000 0.00 0.5000000
## 457 0.1428571 0.50 1.0000000 0.6521739 0.40 0.5714286
## 223 0.3333333 0.25 0.6521739 1.0000000 0.25 0.4285714
## 2366 0.0000000 0.00 0.4000000 0.2500000 1.00 NaN
## 3386 0.0000000 0.50 0.5714286 0.4285714 NaN 1.0000000
item_user_random_100_recommenderlab2 <- item_user_random_100_2 %>%
column_to_rownames(., var = "movieId") %>%
as.matrix(.) %>%
t(.)
corrNumbRL2 <- as.matrix(similarity(as(item_user_random_100_recommenderlab2, "realRatingMatrix"), method = "cosine", which = "items"))
corrNumbRL2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 NA 0.5379408 0.6370851 0.8101851 0.5567510 0.5837510
## 2529 0.5379408 NA 0.4592246 0.7173766 0.6348775 0.5151517
## 2478 0.6370851 0.4592246 NA 0.3956517 0.6179636 0.3160116
## 2329 0.8101851 0.7173766 0.3956517 NA 0.4738899 0.3890170
## 3052 0.5567510 0.6348775 0.6179636 0.4738899 NA 0.6549095
## 3273 0.5837510 0.5151517 0.3160116 0.3890170 0.6549095 NA
corrNumb2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 1.0000000 0.5379408 0.6370851 0.8101851 0.5567510 0.5837511
## 2529 0.5379408 1.0000000 0.4592246 0.7173767 0.6348775 0.5151518
## 2478 0.6370851 0.4592246 1.0000000 0.3956517 0.6179635 0.3160112
## 2329 0.8101851 0.7173767 0.3956517 1.0000000 0.4738899 0.3890170
## 3052 0.5567510 0.6348775 0.6179635 0.4738899 1.0000000 0.6549095
## 3273 0.5837511 0.5151518 0.3160112 0.3890170 0.6549095 1.0000000
item_user_random_100_bool_recommenderlab2 <- item_user_random_100_bool2 %>%
column_to_rownames(., var = "movieId") %>%
t(.)
corrBoolRL2 <- as.matrix(similarity(as(item_user_random_100_bool_recommenderlab2, "realRatingMatrix"), method = "jaccard", which = "items"))
corrBoolRL2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 NA 0.4166667 0.4000000 0.8235294 0.5263158 0.2500000
## 2529 0.4166667 NA 0.4000000 0.6250000 0.6250000 0.5000000
## 2478 0.4000000 0.4000000 NA 0.2500000 0.3333333 0.0000000
## 2329 0.8235294 0.6250000 0.2500000 NA 0.6111111 0.2222222
## 3052 0.5263158 0.6250000 0.3333333 0.6111111 NA 0.2000000
## 3273 0.2500000 0.5000000 0.0000000 0.2222222 0.2000000 NA
corrBool2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 1.0000000 0.4166667 0.4000000 0.8235294 0.5263158 0.2500000
## 2529 0.4166667 1.0000000 0.4000000 0.6250000 0.6250000 0.5000000
## 2478 0.4000000 0.4000000 1.0000000 0.2500000 0.3333333 0.0000000
## 2329 0.8235294 0.6250000 0.2500000 1.0000000 0.6111111 0.2222222
## 3052 0.5263158 0.6250000 0.3333333 0.6111111 1.0000000 0.2000000
## 3273 0.2500000 0.5000000 0.0000000 0.2222222 0.2000000 1.0000000
item_user_random_100_recommenderlab_NoNA1 <- item_user_random_100_recommenderlab1
item_user_random_100_recommenderlab_NoNA1[is.na(item_user_random_100_recommenderlab_NoNA1)] <- 0
coorNumbNoNA1 <- getCorrelationMatrixCosNoNA(item_user_random_100_recommenderlab_NoNA1)
corrNumbNoNARL1 <- as.matrix(similarity(as(item_user_random_100_recommenderlab_NoNA1, "realRatingMatrix"), method = "cosine", which = "items"))
corrNumbNoNARL1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 NA 0.4928292 0.4549946 0.5289497 0.5040260 0.4834155
## 3273 0.4928292 NA 0.4858222 0.4849278 0.4202126 0.4668097
## 457 0.4549946 0.4858222 NA 0.5131144 0.5008965 0.5193710
## 223 0.5289497 0.4849278 0.5131144 NA 0.4951498 0.5072474
## 2366 0.5040260 0.4202126 0.5008965 0.4951498 NA 0.5143400
## 3386 0.4834155 0.4668097 0.5193710 0.5072474 0.5143400 NA
coorNumbNoNA1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 1.0000000 0.4928292 0.4549946 0.5289497 0.5040260 0.4834155
## 3273 0.4928292 1.0000000 0.4858222 0.4849278 0.4202126 0.4668097
## 457 0.4549946 0.4858222 1.0000000 0.5131144 0.5008965 0.5193710
## 223 0.5289497 0.4849278 0.5131144 1.0000000 0.4951498 0.5072474
## 2366 0.5040260 0.4202126 0.5008965 0.4951498 1.0000000 0.5143400
## 3386 0.4834155 0.4668097 0.5193710 0.5072474 0.5143400 1.0000000
item_user_random_100_recommenderlab_NoNA2 <- item_user_random_100_recommenderlab2
item_user_random_100_recommenderlab_NoNA2[is.na(item_user_random_100_recommenderlab_NoNA2)] <- 0
coorNumbNoNA2 <- getCorrelationMatrixCosNoNA(item_user_random_100_recommenderlab_NoNA2)
corrNumbNoNARL2 <- as.matrix(similarity(as(item_user_random_100_recommenderlab_NoNA2, "realRatingMatrix"), method = "cosine", which = "items"))
corrNumbNoNARL2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 NA 0.5082019 0.5213117 0.5735067 0.5137183 0.5169022
## 2529 0.5082019 NA 0.4922180 0.5348368 0.5136068 0.5015267
## 2478 0.5213117 0.4922180 NA 0.4832771 0.5218560 0.4958499
## 2329 0.5735067 0.5348368 0.4832771 NA 0.4916889 0.4734474
## 3052 0.5137183 0.5136068 0.5218560 0.4916889 NA 0.5393864
## 3273 0.5169022 0.5015267 0.4958499 0.4734474 0.5393864 NA
coorNumbNoNA2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 1.0000000 0.5082019 0.5213117 0.5735067 0.5137183 0.5169022
## 2529 0.5082019 1.0000000 0.4922179 0.5348368 0.5136068 0.5015267
## 2478 0.5213117 0.4922179 1.0000000 0.4832771 0.5218559 0.4958499
## 2329 0.5735067 0.5348368 0.4832771 1.0000000 0.4916889 0.4734474
## 3052 0.5137183 0.5136068 0.5218559 0.4916889 1.0000000 0.5393864
## 3273 0.5169022 0.5015267 0.4958499 0.4734474 0.5393864 1.0000000
corrNumbC1 <- coop::cosine(item_user_random_100_recommenderlab1, use = "everything")
corrNumbC1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 1 NA NA NA NA NA
## 3273 NA 1 NA NA NA NA
## 457 NA NA 1 NA NA NA
## 223 NA NA NA 1 NA NA
## 2366 NA NA NA NA 1 NA
## 3386 NA NA NA NA NA 1
corrNumb1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 1.0000000 0.38796536 0.1878325 0.6764613 0.58393080 0.2678174
## 3273 0.3879654 1.00000000 0.2105260 0.4300789 0.00332348 0.3019346
## 457 0.1878325 0.21052604 1.0000000 0.5498115 0.50694134 0.6544472
## 223 0.6764613 0.43007890 0.5498115 1.0000000 0.47334097 0.5391479
## 2366 0.5839308 0.00332348 0.5069413 0.4733410 1.00000000 0.8759123
## 3386 0.2678174 0.30193458 0.6544472 0.5391479 0.87591235 1.0000000
corrBoolVG1 <- vegdist(item_user_random_100_bool_recommenderlab1 %>% t(.), method = "jaccard", na.rm = TRUE) %>%
as.matrix(.)
corrBoolVG1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 0.0000000 1.00 0.8571429 0.6666667 1.00 1.0000000
## 3273 1.0000000 0.00 0.5000000 0.7500000 1.00 0.5000000
## 457 0.8571429 0.50 0.0000000 0.3478261 0.60 0.4285714
## 223 0.6666667 0.75 0.3478261 0.0000000 0.75 0.5714286
## 2366 1.0000000 1.00 0.6000000 0.7500000 0.00 NaN
## 3386 1.0000000 0.50 0.4285714 0.5714286 NaN 0.0000000
corrBool1[1:6, 1:6]
## 2478 3273 457 223 2366 3386
## 2478 1.0000000 0.00 0.1428571 0.3333333 0.00 0.0000000
## 3273 0.0000000 1.00 0.5000000 0.2500000 0.00 0.5000000
## 457 0.1428571 0.50 1.0000000 0.6521739 0.40 0.5714286
## 223 0.3333333 0.25 0.6521739 1.0000000 0.25 0.4285714
## 2366 0.0000000 0.00 0.4000000 0.2500000 1.00 NaN
## 3386 0.0000000 0.50 0.5714286 0.4285714 NaN 1.0000000
corrNumbC2 <- coop::cosine(item_user_random_100_recommenderlab2, use = "everything")
corrNumbC2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 1 NA NA NA NA NA
## 2529 NA 1 NA NA NA NA
## 2478 NA NA 1 NA NA NA
## 2329 NA NA NA 1 NA NA
## 3052 NA NA NA NA 1 NA
## 3273 NA NA NA NA NA 1
corrNumb2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 1.0000000 0.5379408 0.6370851 0.8101851 0.5567510 0.5837511
## 2529 0.5379408 1.0000000 0.4592246 0.7173767 0.6348775 0.5151518
## 2478 0.6370851 0.4592246 1.0000000 0.3956517 0.6179635 0.3160112
## 2329 0.8101851 0.7173767 0.3956517 1.0000000 0.4738899 0.3890170
## 3052 0.5567510 0.6348775 0.6179635 0.4738899 1.0000000 0.6549095
## 3273 0.5837511 0.5151518 0.3160112 0.3890170 0.6549095 1.0000000
corrBoolVG2 <- vegdist(item_user_random_100_bool_recommenderlab2 %>% t(.), method = "jaccard", na.rm = TRUE) %>%
as.matrix(.)
corrBoolVG2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 0.0000000 0.5833333 0.6000000 0.1764706 0.4736842 0.7500000
## 2529 0.5833333 0.0000000 0.6000000 0.3750000 0.3750000 0.5000000
## 2478 0.6000000 0.6000000 0.0000000 0.7500000 0.6666667 1.0000000
## 2329 0.1764706 0.3750000 0.7500000 0.0000000 0.3888889 0.7777778
## 3052 0.4736842 0.3750000 0.6666667 0.3888889 0.0000000 0.8000000
## 3273 0.7500000 0.5000000 1.0000000 0.7777778 0.8000000 0.0000000
corrBool2[1:6, 1:6]
## 223 2529 2478 2329 3052 3273
## 223 1.0000000 0.4166667 0.4000000 0.8235294 0.5263158 0.2500000
## 2529 0.4166667 1.0000000 0.4000000 0.6250000 0.6250000 0.5000000
## 2478 0.4000000 0.4000000 1.0000000 0.2500000 0.3333333 0.0000000
## 2329 0.8235294 0.6250000 0.2500000 1.0000000 0.6111111 0.2222222
## 3052 0.5263158 0.6250000 0.3333333 0.6111111 1.0000000 0.2000000
## 3273 0.2500000 0.5000000 0.0000000 0.2222222 0.2000000 1.0000000
Die Korrelationsmatrix mit ordinalen Ratings scheint viel detailliertere Korrelationswerte zurückzugeben, da wir genaue Ratings der User haben. Da mit der Umwandlung zu binären Werten diese Informationen verloren gehen, sieht die Korrelationsmatrix mit binären Werten dementsprechend weniger hochauflösend aus.
# Sample 1
# predict IBCF
pIBCF1 <- predict(IBCF1, as(test1, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 1
# calc frequency of predicted movies
freq_pred_IBCF1 <- table(unlist(as(pIBCF1, "list"))) %>%
as.data.frame() %>%
rename(movieId = Var1) %>%
arrange(desc(Freq))
# Sample 2
# predict IBCF
pIBCF2 <- predict(IBCF2, as(test2, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 2
# calc frequency of predicted movies
freq_pred_IBCF2 <- table(unlist(as(pIBCF2, "list"))) %>%
as.data.frame() %>%
rename(movieId = Var1) %>%
arrange(desc(Freq))
# Sample 1
# train UBCF
UBCF1 <- Recommender(as(training1, "realRatingMatrix"), "UBCF",
param = list(normalize = NULL, method = "cosine", nn = 30)
)
# predict UBCF
pUBCF1 <- predict(UBCF1, as(test1, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 1
# calc frequency of predicted movies
freq_pred_UBCF1 <- table(unlist(as(pUBCF1, "list"))) %>%
as.data.frame() %>%
rename(movieId = Var1) %>%
arrange(desc(Freq))
# Sample 2
# train UBCF
UBCF2 <- Recommender(as(training2, "realRatingMatrix"), "UBCF",
param = list(method = "cosine", nn = 30)
)
# predict UBCF
pUBCF2 <- predict(UBCF2, as(test2, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 2
# calc frequency of predicted movies
freq_pred_UBCF2 <- table(unlist(as(pUBCF2, "list"))) %>%
as.data.frame() %>%
rename(movieId = Var1) %>%
arrange(desc(Freq))
# Sample 1
freq_pred_UBCF1$type <- "UBCF"
freq_pred_IBCF1$type <- "IBCF"
moviesUIBCF1 <- rbind(freq_pred_UBCF1, freq_pred_IBCF1)
# Sample 1
ggplot(moviesUIBCF1, aes(Freq, fill = type)) +
geom_histogram(alpha = 0.6, position = "dodge") +
scale_fill_manual(values = c("#69b3a2", "#404080"))
# Sample 2
freq_pred_UBCF2$type <- "UBCF"
freq_pred_IBCF2$type <- "IBCF"
moviesUIBCF2 <- rbind(freq_pred_UBCF2, freq_pred_IBCF2)
# Sample 2
ggplot(moviesUIBCF2, aes(Freq, fill = type)) +
geom_histogram(alpha = 0.6, position = "dodge") +
scale_fill_manual(values = c("#69b3a2", "#404080"))
Fakten: Der ICBF Recommender empfiehlt mehr unterschiedliche Filme. Der UCBF recommender empfiehlt bis zu 25 mal den gleichen Film. Schlussfolgerung:
UBCF_TOPN1 <- as.data.frame(as(pUBCF1, "matrix"))
UBCF_TOPN1$user <- rownames(UBCF_TOPN1)
UBCF_TOPN1 <- pivot_longer(UBCF_TOPN1, cols = -c(user), values_drop_na = TRUE)
IBCF_TOPN1 <- as.data.frame(as(pIBCF1, "matrix"))
IBCF_TOPN1$user <- rownames(IBCF_TOPN1)
IBCF_TOPN1 <- pivot_longer(IBCF_TOPN1, cols = -c(user), values_drop_na = TRUE)
IU_BCF_cosine_intersect1 <- left_join(IBCF_TOPN1, UBCF_TOPN1, by = c("user", "name"))
# count intersect
IU_BCF_cosine_intersect1 %>%
select(user, value.y) %>%
group_by(user) %>%
summarise(total_intersect = sum(!is.na(value.y))) %>%
ggplot(aes(total_intersect)) +
geom_histogram()
UBCF_TOPN2 <- as.data.frame(as(pUBCF2, "matrix"))
UBCF_TOPN2$user <- rownames(UBCF_TOPN2)
UBCF_TOPN2 <- pivot_longer(UBCF_TOPN2, cols = -c(user), values_drop_na = TRUE)
IBCF_TOPN2 <- as.data.frame(as(pIBCF2, "matrix"))
IBCF_TOPN2$user <- rownames(IBCF_TOPN2)
IBCF_TOPN2 <- pivot_longer(IBCF_TOPN2, cols = -c(user), values_drop_na = TRUE)
IU_BCF_cosine_intersect2 <- left_join(IBCF_TOPN2, UBCF_TOPN2, by = c("user", "name"))
# count intersect
IU_BCF_cosine_intersect2 %>%
select(user, value.y) %>%
group_by(user) %>%
summarise(total_intersect = sum(!is.na(value.y))) %>%
ggplot(aes(total_intersect)) +
geom_histogram()
# Sample 1
# Create binary training and test data
training_binary1 <- training1 > 0
training_binary1[is.na(training_binary1)] <- 0
test_binary1 <- test1 > 0
test_binary1[is.na(test_binary1)] <- 0
# Sample 2
# Create binary training and test data
training_binary2 <- training2 > 0
training_binary2[is.na(training_binary2)] <- 0
test_binary2 <- test2 > 0
test_binary2[is.na(test_binary2)] <- 0
# Sample 1
# Train and test binary UBCF-recommender
UBCF_binary1 <- Recommender(as(training_binary1, "realRatingMatrix"), "UBCF", param = list(normalize = NULL, method = "jaccard"))
pUBCF_binary1 <- predict(UBCF_binary1, as(test1, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 2
# Train and test binary UBCF-recommender
UBCF_binary1 <- Recommender(as(training_binary1, "realRatingMatrix"), "UBCF", param = list(normalize = NULL, method = "jaccard"))
pUBCF_binary1 <- predict(UBCF_binary1, as(test1, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 1
# Train and test binary IBCF-recommender
IBCF_binary1 <- Recommender(as(training_binary1, "realRatingMatrix"), "IBCF", param = list(normalize = NULL, method = "jaccard"))
pIBCF_binary1 <- predict(IBCF_binary1, as(test1, "realRatingMatrix"), type = "topNList", n = 15)
# Sample 2
# Train and test binary IBCF-recommender
IBCF_binary2 <- Recommender(as(training_binary2, "realRatingMatrix"), "UBCF", param = list(normalize = NULL, method = "jaccard"))
pIBCF_binary2 <- predict(IBCF_binary1, as(test2, "realRatingMatrix"), type = "topNList", n = 15)